library(tidyverse)
library(knitr)
library(plotly) ; library(viridis) ; library(gridExtra) ; library(RColorBrewer)
library(biomaRt)
library(Rtsne)
library(ROCR) ; library(car)
library(corrplot)
library(expss) ; library(knitr)
SFARI_colour_hue = function(r) {
pal = c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9','#b3b3b3','#808080','gray','#d9d9d9')[r]
}
Load dataset (preprocessing code in 20_04_07_create_dataset.html)
clustering_selected = 'DynamicHybridMergedSmall'
print(paste0('Using clustering ', clustering_selected))
## [1] "Using clustering DynamicHybridMergedSmall"
# Dataset created with DynamicTreeMerged algorithm
dataset = read.csv(paste0('./../Data/dataset_', clustering_selected, '.csv'), row.names=1)
# Add gene symbol
getinfo = c('ensembl_gene_id','external_gene_id')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL', dataset='hsapiens_gene_ensembl',
host='feb2014.archive.ensembl.org') ## Gencode v19
gene_names = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=rownames(dataset), mart=mart)
rm(getinfo, mart)
rm_cluster = dataset[is.na(dataset$MTcor),'Module'] %>% unique %>% as.character
print(paste0('Removing ', sum(dataset$Module=='gray'), ' genes without cluster'))
## [1] "Removing 113 genes without cluster"
new_dataset = dataset %>% filter(Module != 'gray' & !is.na(MTcor))
Using Module Membership variables instead of binary module membership
Not including p-value variables
Including a new variable with the absolute value of GS
Removing information from gray module (unclassified genes)
Objective variable: Binary label indicating if it’s in the SFARI dataset or not
new_dataset = new_dataset %>% dplyr::select(-c(matches(paste('pval|Module')), MMgray)) %>%
mutate('absGS'=abs(GS), 'SFARI'=ifelse(gene.score=='None', FALSE, TRUE)) %>%
dplyr::select(-gene.score)
rownames(new_dataset) = rownames(dataset)[dataset$Module != 'gray']
rm(rm_cluster)
original_dataset = dataset
dataset = new_dataset
print(paste0('The final dataset contains ', nrow(dataset), ' observations and ', ncol(dataset), ' variables'))
## [1] "The final dataset contains 16034 observations and 34 variables"
rm(new_dataset)
Sample of the dataset (transposed so it’s easier to see)
dataset %>% head(5) %>% t %>% kable
| ENSG00000000003 | ENSG00000000419 | ENSG00000000457 | ENSG00000000460 | ENSG00000000938 | |
|---|---|---|---|---|---|
| MTcor | 0.5704012 | 0.2525473 | -0.9514071 | -0.8039747 | 0.8771832 |
| GS | 0.3866600 | 0.0319742 | -0.3948272 | -0.2425891 | 0.4206897 |
| MM.B79F00 | -0.3421536 | -0.0685993 | 0.1822384 | -0.0203122 | 0.0401878 |
| MM.FE6E8A | -0.0815013 | -0.2406456 | -0.0314884 | -0.1149264 | 0.2532996 |
| MM.00BF7D | -0.1457984 | -0.1570793 | 0.2895552 | 0.2146387 | -0.1582971 |
| MM.D376FF | -0.1558365 | -0.3275203 | 0.2216473 | 0.1076922 | -0.1267206 |
| MM.00B7E9 | -0.2706076 | 0.1086927 | 0.2716955 | 0.3287583 | -0.1127668 |
| MM.00BD5F | -0.2103137 | -0.0739132 | 0.1588410 | 0.1164722 | -0.1165457 |
| MM.00BA38 | -0.2073096 | -0.2063582 | 0.1423063 | -0.0137954 | -0.2407892 |
| MM.FF62BC | 0.0881871 | -0.0729658 | -0.0536479 | -0.1519834 | 0.0208935 |
| MM.619CFF | -0.0278009 | -0.3888226 | 0.1529542 | 0.0699100 | -0.0311339 |
| MM.EF7F49 | -0.0743495 | -0.3953383 | 0.1857405 | -0.1853277 | -0.0511009 |
| MM.F8766D | 0.0320566 | -0.2755247 | -0.0473936 | -0.2555324 | -0.0717451 |
| MM.00C0AF | -0.3240023 | -0.0804812 | 0.4041119 | 0.1597938 | -0.4479131 |
| MM.B983FF | -0.1398423 | -0.1306399 | 0.2955196 | 0.0852169 | -0.3261794 |
| MM.F564E3 | -0.0472562 | -0.0021517 | 0.1666066 | -0.0218248 | -0.2134750 |
| MM.FD61D1 | -0.2102549 | 0.3544128 | 0.2690348 | 0.4502318 | -0.4636712 |
| MM.8AAB00 | 0.1951790 | 0.3820330 | -0.0471786 | 0.2444100 | -0.1238171 |
| MM.C99800 | 0.1463015 | 0.5008550 | -0.0892625 | 0.3573500 | -0.0912000 |
| MM.00BFC4 | -0.0419816 | 0.0223852 | -0.0618101 | -0.0495563 | 0.2941168 |
| MM.E58700 | 0.1436711 | -0.0157220 | -0.3400724 | -0.2724479 | 0.1215078 |
| MM.FF67A4 | 0.2300837 | -0.1215257 | -0.1906585 | -0.3770794 | 0.1655206 |
| MM.00A7FF | -0.1508215 | 0.1640578 | 0.0744979 | 0.1972789 | -0.1825727 |
| MM.6BB100 | -0.1412895 | 0.0441975 | 0.0625034 | 0.2417779 | 0.0799198 |
| MM.00C097 | 0.1450441 | 0.0257528 | -0.2382952 | -0.0953965 | 0.2989640 |
| MM.39B600 | 0.3292611 | -0.0759346 | -0.3095235 | -0.3958098 | 0.2579894 |
| MM.9590FF | 0.2281574 | 0.1383876 | -0.2008810 | 0.0354509 | 0.3490105 |
| MM.A3A500 | 0.2008548 | 0.0853587 | -0.1824296 | -0.1208052 | 0.2633420 |
| MM.00BCD8 | 0.0804575 | -0.0263322 | -0.2410231 | 0.0801394 | 0.0722277 |
| MM.E76BF3 | 0.4768465 | -0.1831555 | -0.1360703 | -0.2163761 | 0.1917615 |
| MM.00B0F6 | 0.3884997 | 0.1582543 | -0.3053931 | -0.2093428 | 0.3554336 |
| MM.D89000 | 0.1285911 | 0.1002678 | -0.1304555 | 0.1913294 | 0.0942191 |
| absGS | 0.3866600 | 0.0319742 | 0.3948272 | 0.2425891 | 0.4206897 |
| SFARI | 0.0000000 | 0.0000000 | 0.0000000 | 0.0000000 | 0.0000000 |
Objective variable distribution: Unbalanced labels
print(table(dataset$SFARI))
##
## FALSE TRUE
## 15137 897
cat(paste0('\n',round(mean(dataset$SFARI)*100,2), '% of the observations are positive'))
##
## 5.59% of the observations are positive
Chose the t-SNE algorithm because it preserves distances
The SFARI labels is still close to the absolute value of Gene Significance. This time the MM variables seem to be grouped in 2 clusters
tsne = dataset %>% t %>% Rtsne(perplexity=10)
plot_data = data.frame('ID'=colnames(dataset), 'C1'=tsne$Y[,1], 'C2'=tsne$Y[,2],
type=ifelse(grepl('MM', colnames(dataset)),'ModMembership',
ifelse(grepl('SFARI', colnames(dataset)), 'SFARI',
ifelse(grepl('GS', colnames(dataset)), 'GS', 'MTcor'))))
ggplotly(plot_data %>% ggplot(aes(C1, C2, color=type)) + geom_point(aes(id=ID)) +
theme_minimal() + ggtitle('t-SNE visualisation of variables'))
The Module Membership variables are grouped by Module-Trait correlation, with positive correlations on one side, negative on the other, and modules with low correlation far away from the SFARI tag
mtcor_by_module = original_dataset %>% dplyr::select(Module, MTcor) %>% unique
colnames(mtcor_by_module) = c('ID','MTcor')
plot_data = mtcor_by_module %>% mutate(ID = gsub('#','MM.',ID)) %>% right_join(plot_data, by='ID')
ggplotly(plot_data %>% ggplot(aes(C1, C2, color=MTcor)) + geom_point(aes(id=ID)) +
scale_color_viridis() + theme_minimal() +
ggtitle('t-SNE of variables coloured by Module-Diagnosis correlation'))
rm(mtcor_by_module, tsne)
The two main patterns that seem to characterise the genes are their Gene Significance and the Module-Diagnosis correlation of their corresponding module
Mean Expression doesn’t seem to play an important role
I don’t know what the 2nd PC is capturing
# Mean Expression data
load('./../Data/preprocessed_data.RData')
## Registered S3 methods overwritten by 'Hmisc':
## method from
## [.labelled expss
## print.labelled expss
## as.data.frame.labelled expss
datExpr = datExpr %>% data.frame
mean_expr = data.frame('ID'=rownames(datExpr), 'meanExpr' = rowMeans(datExpr))
# PCA
pca = dataset %>% t %>% prcomp
plot_data = data.frame('ID'=rownames(dataset), 'PC1'=pca$rotation[,1], 'PC2'=pca$rotation[,2],
'SFARI'=dataset$SFARI, 'MTcor'=dataset$MTcor, 'GS'=dataset$GS) %>%
mutate(alpha=ifelse(SFARI, 0.7, 0.2)) %>% left_join(mean_expr, by='ID')
p1 = plot_data %>% ggplot(aes(PC1, PC2, color=MTcor)) + geom_point(alpha=0.4) + scale_color_viridis() +
theme_minimal() + ggtitle('Genes coloured by Module-Diagnosis correlation') +
xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1]),'%)')) +
ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2]),'%)')) +
theme(legend.position='bottom')
p2 = plot_data %>% ggplot(aes(PC1, PC2, color=GS)) + geom_point(alpha=0.4) + scale_color_viridis() +
theme_minimal() + ggtitle('Genes coloured by Gene Significance') + theme(legend.position='bottom')
p3 = plot_data %>% ggplot(aes(PC1, PC2, color=SFARI)) + geom_point(alpha = plot_data$alpha) +
theme_minimal() + ggtitle('Genes coloured by SFARI label') + theme(legend.position='bottom')
p3 = ggExtra::ggMarginal(p3, type='density', groupColour=TRUE, size=10)
p4 = plot_data %>% ggplot(aes(PC1, PC2, color=meanExpr)) + geom_point(alpha=0.4) + scale_color_viridis() +
theme_minimal() + ggtitle('Genes coloured by mean level of expression') + theme(legend.position='bottom')
grid.arrange(p1, p2, p3, p4, nrow=2)
rm(pca, datExpr, datGenes, datMeta, dds, DE_info, mean_expr, p1, p2, p3, p4)
For now, will do this using over- and under-sampling of the classes, but later on should check SMOTE (Synthetic Minority Over-sampling Technique) method
Need to divide first into train and test sets to keep the sets independent: using 80% of the Positive observations on the training set
Note: Even though our label is binary, I want to have representative samples for all SFARI scores in both the training and test data, so instead of pooling all the SFARI scores together and randomly selecting 80% of the samples, I’m going to create the positive set selecting 80% of each of the samples by score
set.seed(123)
positive_sample_balancing_SFARI_scores = function(p){
positive_train_idx = c()
positive_test_idx = c()
for(score in 1:6){
score_genes = rownames(original_dataset)[rownames(original_dataset) %in% rownames(dataset) & original_dataset$gene.score == score]
score_idx = which(rownames(dataset) %in% score_genes)
score_train_idx = sample(score_idx, size = ceiling(p*length(score_idx)))
score_test_idx = score_idx[!score_idx %in% score_train_idx]
positive_train_idx = c(positive_train_idx, score_train_idx)
positive_test_idx = c(positive_test_idx, score_test_idx)
}
return(list('train' = sort(positive_train_idx), 'test' = sort(positive_test_idx)))
}
# 80% of the samples for the training set
p = 0.8
positive_idx = positive_sample_balancing_SFARI_scores(p)
positive_train_idx = positive_idx[['train']]
positive_test_idx = positive_idx[['test']]
negative_idx = which(!dataset$SFARI)
negative_train_idx = sort(sample(negative_idx, size=ceiling(p*length(negative_idx))))
negative_test_idx = negative_idx[!negative_idx %in% negative_train_idx]
train_set = dataset[c(positive_train_idx, negative_train_idx),]
test_set = dataset[c(positive_test_idx, negative_test_idx),]
rm(positive_idx, negative_idx, positive_train_idx, positive_test_idx, negative_train_idx, negative_test_idx,
p, positive_sample_balancing_SFARI_scores)
Over-sampling observations with positive SFARI label: Sample with replacement 4x original number of observations
Sample with replacement positive observations in train set
positive_obs = which(train_set$SFARI)
add_obs = sample(positive_obs, size=3*length(positive_obs), replace=TRUE)
train_set = train_set[c(1:nrow(train_set), add_obs),]
rm(positive_obs, add_obs)
Under-sampling observations with negative SFARI labels
print(paste0('Keeping ~',round(100*sum(train_set$SFARI)/sum(!train_set$SFARI)),
'% of the Negative observations in the training set'))
## [1] "Keeping ~24% of the Negative observations in the training set"
negative_obs = which(!train_set$SFARI)
keep_obs = sample(negative_obs, size=sum(train_set$SFARI))
train_set = train_set[c(keep_obs, which(train_set$SFARI)),]
rm(negative_obs, keep_obs)
Label distribution in training set
cro(train_set$SFARI)
|  #Total | |
|---|---|
| Â train_set$SFARIÂ | |
| Â Â Â FALSEÂ | 2880 |
| Â Â Â TRUEÂ | 2880 |
|    #Total cases | 5760 |
Labels distribution in test set
cro(test_set$SFARI)
|  #Total | |
|---|---|
| Â test_set$SFARIÂ | |
| Â Â Â FALSEÂ | 3027 |
| Â Â Â TRUEÂ | 177 |
|    #Total cases | 3204 |
Train model
train_set$SFARI = train_set$SFARI %>% as.factor
fit = glm(SFARI~., data=train_set, family='binomial')
Predict labels in test set
test_set$prob = predict(fit, newdata=test_set, type='response')
test_set$pred = test_set$prob>0.5
Confusion matrix
conf_mat = test_set %>% apply_labels(SFARI = 'Actual Labels',
prob = 'Assigned Probability',
pred = 'Label Prediction')
cro(conf_mat$SFARI, list(conf_mat$pred, total()))
|  Label Prediction |  #Total | |||
|---|---|---|---|---|
| Â FALSEÂ | Â TRUEÂ | Â | ||
|  Actual Labels | ||||
| Â Â Â FALSEÂ | 1982 | 1045 | Â | 3027 |
| Â Â Â TRUEÂ | 76 | 101 | Â | 177 |
|    #Total cases | 2058 | 1146 |  | 3204 |
rm(conf_mat)
acc = mean(test_set$SFARI==test_set$pred)
print(paste0('Accuracy = ', round(acc,4)))
## [1] "Accuracy = 0.6501"
rm(acc)
pred_ROCR = prediction(test_set$prob, test_set$SFARI)
roc_ROCR = performance(pred_ROCR, measure='tpr', x.measure='fpr')
AUC = performance(pred_ROCR, measure='auc')@y.values[[1]]
plot(roc_ROCR, main=paste0('ROC curve (AUC=',round(AUC,2),')'), col='#009999')
abline(a=0, b=1, col='#666666')
lift_ROCR = performance(pred_ROCR, measure='lift', x.measure='rpp')
plot(lift_ROCR, main='Lift curve', col='#86b300')
rm(pred_ROCR, roc_ROCR, AUC, lift_ROCR)
SFARI genes have a slightly higher score distribution than the rest
plot_data = test_set %>% dplyr::select(prob, SFARI)
plot_data %>% ggplot(aes(prob, fill=SFARI, color=SFARI)) + geom_density(alpha=0.3) + xlab('Score') +
theme_minimal() + ggtitle('Model score distribution by SFARI Label')
There seems to be a positie relation between the SFARI scores and the probability assigned by the model
The number of observations when separating the test set by SFARI score is quite small, so this is not a robust result, specially for scores 1, 2 and 6
plot_data = test_set %>% mutate(ID=rownames(test_set)) %>% dplyr::select(ID, prob) %>%
left_join(original_dataset %>% mutate(ID=rownames(original_dataset)), by='ID') %>%
dplyr::select(ID, prob, gene.score) %>% apply_labels(gene.score='SFARI Gene score')
cro(plot_data$gene.score)
|  #Total | |
|---|---|
|  SFARI Gene score | |
| Â Â Â 1Â | 5 |
| Â Â Â 2Â | 12 |
| Â Â Â 3Â | 38 |
| Â Â Â 4Â | 86 |
| Â Â Â 5Â | 32 |
| Â Â Â 6Â | 4 |
|    None | 3027 |
|    #Total cases | 3204 |
mean_vals = plot_data %>% group_by(gene.score) %>% summarise(mean_prob = mean(prob))
# plot_data %>% ggplot(aes(prob, color=gene.score, fill=gene.score)) + geom_density(alpha=0.25) +
# geom_vline(data=mean_vals, aes(xintercept=mean_prob, color=gene.score), linetype='dashed') +
# scale_colour_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) +
# scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) +
# ggtitle('Distribution of probabilities by SFARI score') +
# xlab('Probability') + ylab('Density') + theme_minimal()
ggplotly(plot_data %>% ggplot(aes(gene.score, prob, fill=gene.score)) + geom_boxplot() +
scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) +
ggtitle('Distribution of probabilities by SFARI score') +
xlab('SFARI score') + ylab('Probability') + theme_minimal())
rm(mean_vals)
Any variable with a VIF above 10 is considered to have strong multicollinearity: the dataset has a really big problem with multicollinearity :/
Multicollinearity affects the coefficients and p-values of the regression, but it doesn’t affect the predictions, precision of the predictions or the goodness-of-fit statistics ref. So we cannot analyse the coefficients to see which features are the most important, but all the results from this analysis are valid
# VIF
car::vif(fit) %>% sort
## absGS MTcor MM.00BCD8 MM.00BFC4 MM.B79F00 MM.E76BF3
## 1.120003 3.322214 5.760965 11.977660 28.704800 30.453238
## MM.619CFF MM.E58700 MM.D89000 MM.FF67A4 MM.39B600 MM.A3A500
## 40.019097 43.341694 48.009397 55.717881 74.444993 90.507830
## GS MM.00BF7D MM.8AAB00 MM.00A7FF MM.F8766D MM.00BD5F
## 96.406577 104.269895 124.383852 132.186820 185.578598 196.779668
## MM.EF7F49 MM.FF62BC MM.B983FF MM.FD61D1 MM.FE6E8A MM.C99800
## 217.997495 244.733566 267.532403 289.004766 293.298020 311.423166
## MM.9590FF MM.00B7E9 MM.D376FF MM.00BA38 MM.00B0F6 MM.00C0AF
## 345.026524 354.944339 367.784988 397.850985 423.554841 476.163259
## MM.6BB100 MM.F564E3 MM.00C097
## 494.468875 550.127905 1003.865351
# Correlation plot
corrplot.mixed(cor(train_set[,-ncol(train_set)]), lower = 'number', lower.col = 'gray', number.cex = .6, tl.pos = 'l', tl.col = '#666666')
3 of the 5 genes with SFARI score of 1 are in the top 6 genes
Considering the ratio of 3021:177, there are a lot of genes with a SFARI score in the top genes
There’s not a single gene with a SFARI score of 5 or 6
test_set %>% dplyr::select(prob, SFARI) %>% mutate(ID = rownames(test_set)) %>%
arrange(desc(prob)) %>% top_n(50, wt=prob) %>%
left_join(original_dataset %>% mutate(ID=rownames(original_dataset)), by='ID') %>%
left_join(gene_names, by = c('ID'='ensembl_gene_id')) %>%
dplyr::rename('GeneSymbol' = external_gene_id, 'Probability' = prob, 'ModuleDiagnosis_corr' = MTcor) %>%
mutate(ModuleDiagnosis_corr = round(ModuleDiagnosis_corr,4), Probability = round(Probability,4)) %>%
dplyr::select(GeneSymbol, gene.score, ModuleDiagnosis_corr, Module, Probability) %>%
kable(caption = 'Genes with highest model probabilities from the test set')
| GeneSymbol | gene.score | ModuleDiagnosis_corr | Module | Probability |
|---|---|---|---|---|
| ARID1B | 1 | 0.1127 | #FF62BC | 0.8693 |
| TLN2 | None | -0.9514 | #00C0AF | 0.8516 |
| NAV1 | None | -0.6031 | #00BA38 | 0.8464 |
| KMT2A | 1 | 0.7916 | #00C097 | 0.8447 |
| PDS5B | None | -0.9514 | #00C0AF | 0.8424 |
| SCN2A | 1 | -0.9514 | #00C0AF | 0.8416 |
| EIF4G3 | None | -0.8040 | #00B7E9 | 0.8383 |
| EP400 | 3 | -0.6031 | #00BA38 | 0.8366 |
| BAZ2A | None | -0.0094 | #00A7FF | 0.8364 |
| ATN1 | None | -0.6031 | #00BA38 | 0.8354 |
| BICD1 | None | 0.0586 | #FE6E8A | 0.8340 |
| AFF3 | None | 0.0586 | #FE6E8A | 0.8324 |
| KIAA0226 | None | -0.6031 | #00BA38 | 0.8312 |
| RIMBP2 | None | -0.0094 | #00A7FF | 0.8265 |
| DAAM1 | None | -0.0094 | #00A7FF | 0.8259 |
| CERS6 | None | -0.9514 | #00C0AF | 0.8247 |
| NFIC | None | -0.6031 | #00BA38 | 0.8243 |
| PTPRS | None | -0.2526 | #F564E3 | 0.8228 |
| SCAF4 | None | -0.6031 | #00BA38 | 0.8166 |
| FRMPD4 | None | -0.9514 | #00C0AF | 0.8159 |
| PAK2 | 3 | 0.1127 | #FF62BC | 0.8146 |
| ZNF385A | None | -0.6031 | #00BA38 | 0.8138 |
| JAZF1 | None | -0.9514 | #00C0AF | 0.8131 |
| PLXNC1 | None | -0.0094 | #00A7FF | 0.8122 |
| TET3 | None | 0.7287 | #39B600 | 0.8119 |
| RAB7A | None | -0.6031 | #00BA38 | 0.8105 |
| GRIN2A | 3 | -0.9514 | #00C0AF | 0.8102 |
| SF3A1 | None | -0.6031 | #00BA38 | 0.8101 |
| DLGAP1 | 3 | -0.9514 | #00C0AF | 0.8091 |
| USP32 | None | -0.6031 | #00BA38 | 0.8086 |
| RASAL2 | None | -0.4891 | #00BF7D | 0.8080 |
| MEF2D | None | -0.6031 | #00BA38 | 0.8080 |
| AFAP1 | None | 0.1127 | #FF62BC | 0.8073 |
| MAP3K13 | None | -0.6750 | #D376FF | 0.8047 |
| KMT2E | 3 | 0.7916 | #00C097 | 0.8043 |
| TMEM178B | None | -0.9514 | #00C0AF | 0.8035 |
| TENM1 | None | 0.7916 | #00C097 | 0.8027 |
| KSR2 | None | -0.6031 | #00BA38 | 0.8025 |
| WNK1 | None | 0.2213 | #A3A500 | 0.8009 |
| UBAP2L | None | -0.6031 | #00BA38 | 0.8004 |
| MARK4 | None | -0.6031 | #00BA38 | 0.7988 |
| SLC32A1 | None | -0.6031 | #00BA38 | 0.7981 |
| BRD4 | 4 | -0.6031 | #00BA38 | 0.7979 |
| SEZ6L | None | -0.9514 | #00C0AF | 0.7978 |
| PPP1R12B | None | -0.6031 | #00BA38 | 0.7968 |
| PRPF8 | None | -0.6031 | #00BA38 | 0.7945 |
| NEDD4L | None | 0.1127 | #FF62BC | 0.7942 |
| GRIK5 | 3 | -0.6031 | #00BA38 | 0.7938 |
| RAD50 | None | -0.0094 | #00A7FF | 0.7936 |
| CLASP1 | 3 | -0.4946 | #B79F00 | 0.7898 |
Running the model on all non-SFARI genes (excluding the ones in the train set)
negative_set = dataset %>% filter(!SFARI & !rownames(.) %in% rownames(train_set)) %>% dplyr::select(-SFARI)
rownames(negative_set) = rownames(dataset)[!dataset$SFARI & !rownames(dataset) %in% rownames(train_set)]
negative_set$prob = predict(fit, newdata=negative_set, type='response')
negative_set$pred = negative_set$prob>0.5
negative_set_table = negative_set %>% apply_labels(prob = 'Assigned Probability',
pred = 'Label Prediction')
cro(negative_set_table$pred)
|  #Total | |
|---|---|
|  Label Prediction | |
| Â Â Â FALSEÂ | 8091 |
| Â Â Â TRUEÂ | 4166 |
|    #Total cases | 12257 |
cat(paste0('\n', sum(negative_set$pred), ' genes are predicted as ASD-related'))
##
## 4166 genes are predicted as ASD-related
negative_set %>% ggplot(aes(prob)) + geom_density(color='#F8766D', fill='#F8766D', alpha=0.5) +
geom_vline(xintercept=0.5, color='#333333', linetype='dotted') +
ggtitle('Probability distribution of all the Negative samples in the dataset') +
theme_minimal()
There’s a lot of noise, but the genes with the highest probabilities have slightly higher (absolute) Gene Significance
negative_set %>% ggplot(aes(prob, GS, color=MTcor)) + geom_point() + geom_smooth(method='loess', color='#666666') +
geom_hline(yintercept=0, color='gray', linetype='dashed') +
scale_color_gradientn(colours=c('#F8766D','white','#00BFC4')) +
ggtitle('Relation between Probability and Gene Significance') + theme_minimal()
negative_set %>% ggplot(aes(prob, abs(GS), color=MTcor)) + geom_point() +
geom_hline(yintercept=mean(negative_set$absGS), color='gray', linetype='dashed') +
geom_smooth(method='loess', color='#666666') +
scale_color_gradientn(colours=c('#F8766D','white','#00BFC4')) +
ggtitle('Relation between Model probability and Gene Significance') + theme_minimal()
On average, the model seems to be assigning a probability inversely proportional to the Module-Diagnosis correlation of the module, with the highest positively correlated modules having the lowest average probability and the highest negatively correlated modules the highest average probability. But the difference isn’t big
negative_set %>% ggplot(aes(MTcor, prob, color=GS)) + geom_point() + geom_smooth(method='loess', color='#666666') +
geom_hline(yintercept=mean(negative_set$prob), color='gray', linetype='dashed') +
scale_color_gradientn(colours=c('#F8766D','#F8766D','white','#00BFC4','#00BFC4')) +
xlab('Modules ordered by their correlation to ASD') + ylab('Model probability') +
theme_minimal()
Summarised version, plotting by module instead of by gene
The difference in the trend lines between this plot and the one above is that the one above takes all the points into consideration while this considers each module as an observation by itself, so the top one is strongly affected by big modules and the bottom one treats all modules the same
The model seems to give higher probabilities to genes belonging to modules with a small (absolute) correlation to Diagnosis, although the difference isn’t much
plot_data = negative_set %>% group_by(MTcor) %>% summarise(mean = mean(prob), sd = sd(prob), n = n()) %>%
mutate(MTcor_sign = ifelse(MTcor>0, 'Positive', 'Negative')) %>% left_join(original_dataset, by='MTcor') %>%
dplyr::select(Module, MTcor, MTcor_sign, mean, sd, n) %>% distinct()
colnames(plot_data)[1] = 'ID'
ggplotly(plot_data %>% ggplot(aes(MTcor, mean, size=n, color=MTcor_sign)) + geom_point(aes(id=ID)) +
geom_smooth(method='loess', color='gray', se=FALSE) + geom_smooth(method='lm', se=FALSE) +
xlab('Module-Diagnosis correlation') + ylab('Mean Probability by Model') + theme_minimal())
There is a positive relation between level of expression and probability, the model seems to be capturing indirectly the level of expression of the genes to make the prediction, so it’s introducing the same bias
# Gandal dataset
load('./../Data/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
DE_info = DE_info %>% data.frame
mean_and_sd = data.frame(ID=rownames(datExpr), meanExpr=rowMeans(datExpr), sdExpr=apply(datExpr,1,sd))
plot_data = negative_set %>% mutate(ID=rownames(negative_set)) %>% left_join(mean_and_sd, by='ID') %>%
left_join(original_dataset %>% mutate(ID=rownames(original_dataset)) %>%
dplyr::select(ID, Module), by='ID')
colnames(plot_data)[ncol(plot_data)] = 'Module'
plot_data %>% ggplot(aes(meanExpr, prob)) + geom_point(alpha=0.2, color='#0099cc') +
geom_smooth(method='loess', color='gray', alpha=0.3) +
geom_smooth(method='lm', color='#999999', se=FALSE, alpha=1) +
theme_minimal() + ggtitle('Mean expression vs model probability by gene')
rm(mean_and_sd)
plot_data2 = plot_data %>% group_by(Module) %>% summarise(meanExpr = mean(meanExpr), meanProb = mean(prob), n=n())
ggplotly(plot_data2 %>% ggplot(aes(meanExpr, meanProb, size=n)) + geom_point(color=plot_data2$Module) +
geom_smooth(method='loess', se=TRUE, color='gray', alpha=0.1, size=0.7) +
geom_smooth(method='lm', se=FALSE, color='gray') + theme_minimal() + theme(legend.position='none') +
ggtitle('Mean expression vs model probability by Module'))
rm(plot_data2)
There is also a positive relation between the standard deviation of a gene and its regression score, the model could be capturing this characteristic of the genes to make the prediction, and could be introducing bias
plot_data %>% filter(sdExpr<0.5) %>% ggplot(aes(sdExpr, prob)) + geom_point(alpha=0.1, color='#0099cc') +
geom_smooth(method='loess', color='gray', alpha=0.2) +
geom_smooth(method='lm', color='#999999', se=FALSE, alpha=1) +
theme_minimal() + ggtitle('SD vs model probability by gene')
This approximation curve looks like the opposite of the trend found between mean/sd and model scores
plot_data %>% ggplot(aes(meanExpr, sdExpr)) + geom_point(alpha=0.1, color='#0099cc') +
geom_smooth(method='loess', color='gray', alpha=0.3) +
geom_smooth(method='lm', color='#999999', se=FALSE, alpha=1) +
scale_x_log10() + scale_y_log10() +
theme_minimal() + ggtitle('Mean expression vs SD by gene')
There is a relation between probability and lfc, so it IS capturing a bit of true information (because lfc and mean expression were negatively correlated and it still has a positive relation in the model)
plot_data = negative_set %>% mutate(ID=rownames(negative_set)) %>%
left_join(DE_info %>% mutate(ID=rownames(DE_info)), by='ID')
plot_data %>% filter(abs(log2FoldChange)<10) %>%
ggplot(aes(log2FoldChange, prob)) + geom_point(alpha=0.1, color='#0099cc') +
geom_smooth(method='loess', color='gray', alpha=0.3) +
theme_minimal() + ggtitle('lfc vs model probability by gene')
The model is capturing the mean level of expression of the genes (indirectly through module memberhsip), which is a strong bias found in the SFARI scores, but it seems to be capturing a bit of true biological signal as well (based on the GS and the log fold change plots)
save(train_set, test_set, negative_set, fit, dataset, file='./../Data/LR_model.RData')
sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] expss_0.10.2 corrplot_0.84 car_3.0-7 carData_3.0-3
## [5] ROCR_1.0-7 gplots_3.0.3 Rtsne_0.15 biomaRt_2.40.5
## [9] RColorBrewer_1.1-2 gridExtra_2.3 viridis_0.5.1 viridisLite_0.3.0
## [13] plotly_4.9.2 knitr_1.28 forcats_0.5.0 stringr_1.4.0
## [17] dplyr_0.8.5 purrr_0.3.3 readr_1.3.1 tidyr_1.0.2
## [21] tibble_3.0.0 ggplot2_3.3.0 tidyverse_1.3.0
##
## loaded via a namespace (and not attached):
## [1] readxl_1.3.1 backports_1.1.5
## [3] Hmisc_4.4-0 lazyeval_0.2.2
## [5] splines_3.6.3 BiocParallel_1.18.1
## [7] crosstalk_1.1.0.1 GenomeInfoDb_1.20.0
## [9] digest_0.6.25 htmltools_0.4.0
## [11] gdata_2.18.0 fansi_0.4.1
## [13] magrittr_1.5 checkmate_2.0.0
## [15] memoise_1.1.0 cluster_2.1.0
## [17] openxlsx_4.1.4 annotate_1.62.0
## [19] modelr_0.1.6 matrixStats_0.56.0
## [21] prettyunits_1.1.1 jpeg_0.1-8.1
## [23] colorspace_1.4-1 blob_1.2.1
## [25] rvest_0.3.5 haven_2.2.0
## [27] xfun_0.12 crayon_1.3.4
## [29] RCurl_1.98-1.1 jsonlite_1.6.1
## [31] genefilter_1.66.0 survival_3.1-11
## [33] glue_1.3.2 gtable_0.3.0
## [35] zlibbioc_1.30.0 XVector_0.24.0
## [37] DelayedArray_0.10.0 BiocGenerics_0.30.0
## [39] abind_1.4-5 scales_1.1.0
## [41] DBI_1.1.0 miniUI_0.1.1.1
## [43] Rcpp_1.0.4 xtable_1.8-4
## [45] progress_1.2.2 htmlTable_1.13.3
## [47] foreign_0.8-75 bit_1.1-15.2
## [49] Formula_1.2-3 stats4_3.6.3
## [51] htmlwidgets_1.5.1 httr_1.4.1
## [53] acepack_1.4.1 ellipsis_0.3.0
## [55] pkgconfig_2.0.3 XML_3.99-0.3
## [57] farver_2.0.3 nnet_7.3-13
## [59] dbplyr_1.4.2 locfit_1.5-9.4
## [61] later_1.0.0 tidyselect_1.0.0
## [63] labeling_0.3 rlang_0.4.5
## [65] AnnotationDbi_1.46.1 munsell_0.5.0
## [67] cellranger_1.1.0 tools_3.6.3
## [69] cli_2.0.2 generics_0.0.2
## [71] RSQLite_2.2.0 broom_0.5.5
## [73] fastmap_1.0.1 evaluate_0.14
## [75] yaml_2.2.1 bit64_0.9-7
## [77] fs_1.4.0 zip_2.0.4
## [79] caTools_1.18.0 nlme_3.1-144
## [81] mime_0.9 ggExtra_0.9
## [83] xml2_1.2.5 compiler_3.6.3
## [85] rstudioapi_0.11 curl_4.3
## [87] png_0.1-7 reprex_0.3.0
## [89] geneplotter_1.62.0 stringi_1.4.6
## [91] highr_0.8 lattice_0.20-40
## [93] Matrix_1.2-18 vctrs_0.2.4
## [95] pillar_1.4.3 lifecycle_0.2.0
## [97] data.table_1.12.8 bitops_1.0-6
## [99] httpuv_1.5.2 GenomicRanges_1.36.1
## [101] R6_2.4.1 latticeExtra_0.6-29
## [103] promises_1.1.0 KernSmooth_2.23-16
## [105] rio_0.5.16 IRanges_2.18.3
## [107] gtools_3.8.2 assertthat_0.2.1
## [109] SummarizedExperiment_1.14.1 DESeq2_1.24.0
## [111] withr_2.1.2 S4Vectors_0.22.1
## [113] GenomeInfoDbData_1.2.1 mgcv_1.8-31
## [115] parallel_3.6.3 hms_0.5.3
## [117] grid_3.6.3 rpart_4.1-15
## [119] rmarkdown_2.1 Biobase_2.44.0
## [121] shiny_1.4.0.2 lubridate_1.7.4
## [123] base64enc_0.1-3